Overview

> project_summary = "/Users/rory/cache/fabio-splicing/new-samples/2016-07-29_fabio-newlines/project-summary.csv"
> counts_file = "/Users/rory/cache/fabio-splicing/new-samples/2016-07-29_fabio-newlines/combined.counts"
> tx2genes_file = "/Users/rory/cache/fabio-splicing/new-samples/2016-07-29_fabio-newlines/tx2gene.csv"
> cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", 
+     "#D55E00", "#CC79A7")
> summarydata = read.table(project_summary, header = TRUE, sep = ",")
> summarydata = summarydata[, colSums(is.na(summarydata)) < nrow(summarydata)]
> # handle newer bcbio-nextgen runs that use description as the key
> if ("description" %in% colnames(summarydata)) {
+     rownames(summarydata) = summarydata$description
+     summarydata$Name = rownames(summarydata)
+     summarydata$description = NULL
+ } else {
+     rownames(summarydata) = summarydata$Name
+     # summarydata$Name = NULL
+ }
> summarydata = summarydata[order(rownames(summarydata)), ]
> if (file.exists(tx2genes_file)) {
+     sample_dirs = file.path(dirname(project_summary), "..", rownames(summarydata))
+     salmon_files = file.path(sample_dirs, "salmon", "quant.sf")
+     sailfish_files = file.path(sample_dirs, "sailfish", "quant.sf")
+     new_sailfish = file.path(sample_dirs, "sailfish", "quant", "quant.sf")
+     new_salmon = file.path(sample_dirs, "salmon", "quant", "quant.sf")
+     if (file.exists(salmon_files[1])) {
+         sf_files = salmon_files
+     } else if (file.exists(sailfish_files[1])) {
+         sf_files = sailfish_files
+     } else if (file.exists(new_sailfish[1])) {
+         sf_files = new_sailfish
+     } else if (file.exists(new_salmon[1])) {
+         sf_files = new_salmon
+     }
+     names(sf_files) = rownames(summarydata)
+     tx2gene = read.table(tx2genes_file, sep = ",", row.names = NULL, header = FALSE)
+     txi.salmon = tximport(sf_files, type = "salmon", tx2gene = tx2gene, reader = readr::read_tsv, 
+         countsFromAbundance = "lengthScaledTPM")
+     counts = round(data.frame(txi.salmon$counts, check.names = FALSE))
+ } else {
+     counts = read.table(counts_file, header = TRUE, row.names = "id", check.names = FALSE)
+ }
> counts = counts[, order(colnames(counts)), drop = FALSE]
> colnames(counts) = gsub(".counts", "", colnames(counts))
> 
> # this is a list of all non user-supplied metadata columns that could appear
> known_columns = c("Name", "X.GC", "Exonic.Rate", "Sequences.flagged.as.poor.quality", 
+     "rRNA_rate", "Fragment.Length.Mean", "Intronic.Rate", "Intergenic.Rate", 
+     "Mapping.Rate", "Quality.format", "Duplication.Rate.of.Mapped", "Mapped", 
+     "rRNA", "Sequence.length", "Transcripts.Detected", "Mean.Per.Base.Cov.", 
+     "Genes.Detected", "Unique.Starts.Per.Read", "unique_starts_per_read", "complexity", 
+     "X5.3.bias", "Duplicates.pct", "Duplicates", "Mapped.reads", "Average.insert.size", 
+     "Mapped.reads.pct", "Total.reads", "avg_coverage_per_region", "Mapped.Reads")
> summarydata[, "Fragment.Length.Mean"] = summarydata$Average.insert.size
> metadata = summarydata[, !colnames(summarydata) %in% known_columns, drop = FALSE]
> metadata = metadata[, colSums(is.na(metadata)) < nrow(metadata), drop = FALSE]
> metadata$treatment = relevel(metadata$treatment, ref = "Ctrl")
> metadata$bort = relevel(metadata$bort, ref = "N")
> metadata$e7107 = relevel(metadata$e7107, ref = "N")
> mart <- biomaRt::useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
> t2g <- biomaRt::getBM(attributes = c("ensembl_transcript_id", "ensembl_gene_id", 
+     "external_gene_name"), mart = mart)
> t2g <- dplyr::rename(t2g, target_id = ensembl_transcript_id, ens_gene = ensembl_gene_id, 
+     ext_gene = external_gene_name)
> symbols = unique(t2g[, c("ens_gene", "ext_gene")])
> sanitize_datatable = function(df, ...) {
+     # remove dashes which cause wrapping
+     DT::datatable(df, ..., rownames = gsub("-", "_", rownames(df)), colnames = gsub("-", 
+         "_", colnames(df)))
+ }
> # set seed for reproducibility
> set.seed(1454944673)

Sample metadata

> get_heatmap_fn = function(summarydata) {
+     # return the pheatmap function with or without metadata
+     if (ncol(metadata) == 0) {
+         return(pheatmap)
+     } else {
+         # rownames(metadata) = summarydata$Name
+         heatmap_fn = function(data, ...) {
+             pheatmap(data, annotation = metadata, clustering_method = "ward.D2", 
+                 clustering_distance_cols = "correlation", ...)
+         }
+         return(heatmap_fn)
+     }
+ }
> heatmap_fn = get_heatmap_fn(summarydata)

Quality control metrics

> qualimap_run = "Mapped" %in% colnames(summarydata)
> do_quality = "Mapped.reads" %in% colnames(summarydata)

Mapped reads

Overall mapped reads looks good.

> ggplot(summarydata, aes(x = Name, y = Mapped)) + theme_bw(base_size = 10) + 
+     theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) + 
+     geom_bar(stat = "identity") + ylab("mapped reads") + xlab("")

> ggplot(summarydata, aes(x = Name, y = Mapped.reads)) + theme_bw(base_size = 10) + 
+     theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) + 
+     geom_bar(stat = "identity") + ylab("mapped reads") + xlab("")

Genomic mapping rate

Genomic mapping rate looks great.

> ggplot(summarydata, aes(x = Name, y = Mapping.Rate)) + geom_bar(stat = "identity") + 
+     ylab("mapping rate") + xlab("") + theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"), axis.text.x = element_text(angle = 90))

> ggplot(summarydata, aes(x = Name, y = Mapped.reads.pct)) + geom_bar(stat = "identity") + 
+     ylab("mapping rate") + xlab("") + theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"), axis.text.x = element_text(angle = 90))

Number of genes detected

Good number of genes detected.

> dd = data.frame(Name = colnames(counts), Genes.Detected = colSums(counts > 0))
> ggplot(dd, aes(x = Name, y = Genes.Detected)) + geom_bar(stat = "identity") + 
+     theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("genes detected") + 
+     xlab("")

Gene detection saturation

This plot looks a little off, but it is the way the axis are. It is fine.

> col_mapped = ifelse(qualimap_run, "Mapped", "Mapped.reads")
> dd = data.frame(Mapped = summarydata[, col_mapped], Genes.Detected = colSums(counts > 
+     0))
> ggplot(dd, aes(x = Mapped, y = Genes.Detected)) + geom_point() + theme_bw(base_size = 10) + 
+     theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) + 
+     ylab("genes detected") + xlab("reads mapped")

Exonic mapping rate

Here we are starting to see some issues that might be batch effects. Ctrl and Bort1 samples have slightly higher exonic mapping rate.

> ggplot(summarydata, aes(x = Name, y = Exonic.Rate)) + geom_bar(stat = "identity") + 
+     theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("exonic mapping rate") + 
+     xlab("")

rRNA mapping rate

They also have a lower rRNA rate:

> eval_rRNA = "rRNA_rate" %in% colnames(summarydata) & !sum(is.na(summarydata$rRNA_rate)) == 
+     nrow(summarydata)
> ggplot(summarydata, aes(x = Name, y = rRNA_rate)) + geom_bar(stat = "identity") + 
+     theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("rRNA rate") + 
+     xlab("")

Estimated fragment length of paired-end reads

and a higher estimated fragment size

> ggplot(summarydata, aes(x = Name, y = Fragment.Length.Mean)) + geom_bar(stat = "identity") + 
+     theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("fragment length") + 
+     xlab("")

5’->3’ bias

> ggplot(summarydata, aes(x = Name, y = X5.3.bias)) + geom_bar(stat = "identity") + 
+     theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("5'->3' bias") + 
+     xlab("")

Boxplot of log10 counts per gene

> melted = melt(counts)
> colnames(melted) = c("sample", "count")
> melted$sample = factor(melted$sample)
> melted = melted[order(melted$sample), ]
> melted$count = log(melted$count)
> ggplot(melted, aes(x = sample, y = count)) + geom_boxplot() + theme_bw(base_size = 10) + 
+     theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) + 
+     xlab("")

Boxplot of log10 TMM-normalized counts per gene

Trimmed mean of M-values (TMM) normalization is described here

Robinson, M. D., & Oshlack, A. (2010). A scaling normalization method for differential expression analysis of RNA-seq data. Genome Biology, 11(3). doi:10.1186/gb-2010-11-3-r25

> y = DGEList(counts = counts)
> y = calcNormFactors(y)
> normalized_counts = cpm(y, normalized.lib.sizes = TRUE)
> melted = melt(normalized_counts)
> colnames(melted) = c("gene", "sample", "count")
> melted$sample = factor(melted$sample)
> melted = melted[order(melted$sample), ]
> melted$count = log(melted$count)
> ggplot(melted, aes(x = sample, y = count)) + geom_boxplot() + theme_bw(base_size = 10) + 
+     theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) + 
+     xlab("")

Density of log10 TMM-normalized counts

> ggplot(melted, aes(x = count, group = sample)) + geom_density() + theme_bw(base_size = 10) + 
+     theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) + 
+     xlab("")

Correlation heatmap of TMM-normalized counts

The control and bort samples are more similar to each other than the other samples. We won’t be able to tell if that is due to real differences or some of the batch differences we saw above.

Correlation (Pearson)

> heatmap_fn(cor(normalized_counts, method = "pearson"))

Correlation (Spearman)

> heatmap_fn(cor(normalized_counts, method = "spearman"))

PCA plots

We can see the same effects on the PCA plot. The different treatments separate out the cells, bort + control are more similar to each other and E7107 + BE are more similar to each other. It looks like the E7107 treatment is the strongest effect as most of the variance is explained by whether or not the samples have been treated with E7107.

> dds = DESeqDataSetFromMatrix(countData = counts, colData = summarydata, design = ~Name)
> vst = varianceStabilizingTransformation(dds)
> pca_loadings = function(object, ntop = 500) {
+     rv <- matrixStats::rowVars(assay(object))
+     select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
+     pca <- prcomp(t(assay(object)[select, ]))
+     percentVar <- pca$sdev^2/sum(pca$sdev^2)
+     names(percentVar) = colnames(pca$x)
+     pca$percentVar = percentVar
+     return(pca)
+ }
> pc = pca_loadings(vst)
> comps = data.frame(pc$x)
> comps$Name = rownames(comps)
> library(dplyr)
> comps = comps %>% left_join(summarydata, by = c(Name = "Name"))
> colorby = "treatment"
> pca_plot = function(comps, nc1, nc2, colorby) {
+     c1str = paste0("PC", nc1)
+     c2str = paste0("PC", nc2)
+     ggplot(comps, aes_string(c1str, c2str, color = colorby)) + geom_point() + 
+         theme_bw() + xlab(paste0(c1str, ": ", round(pc$percentVar[nc1] * 100), 
+         "% variance")) + ylab(paste0(c2str, ": ", round(pc$percentVar[nc2] * 
+         100), "% variance"))
+ }

PC1 vs. PC2

> pca_plot(comps, 1, 2, colorby)

PC3 vs. PC4

> pca_plot(comps, 3, 4, colorby)

PC5 vs. PC6

> pca_plot(comps, 5, 6, colorby)

Variance explained by component

> ggplot(data.frame(component = reorder(names(pc$percentVar), -pc$percentVar), 
+     percent_var = pc$percentVar), aes(component, percent_var)) + geom_bar(stat = "identity") + 
+     ylab("percent of total variation") + xlab("") + theme_bw()

> # snagged from development version of DESeq
> DESeqDataSetFromTximport <- function(txi, colData, design, ...) {
+     counts <- round(txi$counts)
+     mode(counts) <- "integer"
+     dds <- DESeqDataSetFromMatrix(countData = counts, colData = colData, design = design, 
+         ...)
+     stopifnot(txi$countsFromAbundance %in% c("no", "scaledTPM", "lengthScaledTPM"))
+     if (txi$countsFromAbundance %in% c("scaledTPM", "lengthScaledTPM")) {
+         message("using length scaled TPM counts from tximport")
+     } else {
+         message("using counts and average transcript lengths from tximport")
+         lengths <- txi$length
+         dimnames(lengths) <- dimnames(dds)
+         assays(dds)[["avgTxLength"]] <- lengths
+     }
+     return(dds)
+ }
> 
> subset_tximport = function(txi, rows, columns) {
+     txi$counts = txi$counts[rows, columns]
+     txi$abundance = txi$abundance[rows, columns]
+     txi$length = txi$length[rows, columns]
+     return(txi)
+ }
> library(DEGreport)
> library(vsn)
> design = ~treatment
> condition = "treatment"

Differential expression

> counts <- counts[rowSums(counts > 0) > 1, ]
> if (exists("txi.salmon")) {
+     txi.salmon = subset_tximport(txi.salmon, rownames(counts), colnames(counts))
+     dds = DESeqDataSetFromTximport(txi.salmon, colData = summarydata, design = design)
+ } else {
+     dds = DESeqDataSetFromMatrix(countData = counts, colData = summarydata, 
+         design = design)
+ }
> geoMeans = apply(counts, 1, function(row) if (all(row == 0)) 0 else exp(mean(log(row[row != 
+     0]))))
> dds = estimateSizeFactors(dds, geoMeans = geoMeans)
> dds = DESeq(dds)

Effect of variance stabilization

> par(mfrow = c(1, 3))
> notAllZero <- (rowSums(counts(dds)) > 0)
> rld <- rlog(dds)
> vsd <- varianceStabilizingTransformation(dds)
> rlogMat <- assay(rld)
> vstMat <- assay(vsd)
> 
> meanSdPlot(log2(counts(dds, normalized = TRUE)[notAllZero, ] + 1))

> meanSdPlot(assay(rld[notAllZero, ]))

> meanSdPlot(assay(vsd[notAllZero, ]))

Dispersion estimates

The dispersion estimates are super low, I guess because it is cell lines? Just to be clear, these are biological replicates right? Different replicates of the same treatment were done on different days to different batches?

replicates;

> plotDispEsts(dds)

> e7107 = results(dds, contrast = c("treatment", "E7107", "Ctrl"))
> bort = results(dds, contrast = c("treatment", "Bort", "Ctrl"))
> be = results(dds, contrast = c("treatment", "BE", "Ctrl"))
> plotMA_full = function(res) {
+     ymax = max(res$log2FoldChange, na.rm = TRUE)
+     ymin = min(res$log2FoldChange, na.rm = TRUE)
+     plotMA(res, ylim = c(ymin, ymax))
+ }
> volcano = function(res) {
+     stats = as.data.frame(res[, c(2, 6)])
+     p = volcano_density_plot(stats, lfc.cutoff = 1.5)
+     print(p)
+ }
> write_deseq = function(res, fname) {
+     out_df = as.data.frame(res)
+     out_df$id = rownames(out_df)
+     out_df = out_df[, c("id", colnames(out_df)[colnames(out_df) != "id"])]
+     out_df = out_df %>% left_join(symbols, by = c(id = "ens_gene"))
+     write.table(out_df, file = fname, quote = FALSE, sep = ",", row.names = FALSE, 
+         col.names = TRUE)
+ }

Bort vs control

> plotMA(bort)

> volcano(bort)

TableGrob (2 x 2) "arrange": 3 grobs
  z     cells    name                grob
1 1 (2-2,1-1) arrange     gtable[arrange]
2 2 (2-2,2-2) arrange     gtable[arrange]
3 3 (1-1,1-2) arrange text[GRID.text.513]
> write_deseq(bort, "control-vs-bort-deseq2.csv")

E7107 vs control

> plotMA(e7107)

> volcano(e7107)

TableGrob (2 x 2) "arrange": 3 grobs
  z     cells    name                grob
1 1 (2-2,1-1) arrange     gtable[arrange]
2 2 (2-2,2-2) arrange     gtable[arrange]
3 3 (1-1,1-2) arrange text[GRID.text.681]
> write_deseq(e7107, "control-vs-e7107-deseq2.csv")

BE vs control

> plotMA(be)

> volcano(be)

TableGrob (2 x 2) "arrange": 3 grobs
  z     cells    name                grob
1 1 (2-2,1-1) arrange     gtable[arrange]
2 2 (2-2,2-2) arrange     gtable[arrange]
3 3 (1-1,1-2) arrange text[GRID.text.849]
> write_deseq(be, "control-vs-be-deseq2.csv")

sleuth

> library(dplyr)
> library(sleuth)
> sf_dirs = file.path("..", "..", rownames(summarydata), "sailfish", "quant")
> names(sf_dirs) = rownames(summarydata)
> sfdata = metadata
> sfdata$sample = rownames(sfdata)
> sfdata$path = sf_dirs
> mart <- biomaRt::useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
> t2g <- biomaRt::getBM(attributes = c("ensembl_transcript_id", "ensembl_gene_id", 
+     "external_gene_name"), mart = mart)
> t2g <- dplyr::rename(t2g, target_id = ensembl_transcript_id, ens_gene = ensembl_gene_id, 
+     ext_gene = external_gene_name)

Control vs BE

> library(biomaRt)
> so = sleuth_prep(sfdata, design, target_mapping = t2g) %>% sleuth_fit()
> models(so)
> save(so, file = "sleuth_models.RData")
> rm(so)
> load("sleuth_models.RData")
> so = sleuth_wt(so, "treatmentBE")
> so = sleuth_wt(so, "treatmentBort")
> so = sleuth_wt(so, "treatmentE7107")
> bort_tab = sleuth_results(so, "treatmentBort", show_all = TRUE)
> e7107_tab = sleuth_results(so, "treatmentE7107", show_all = TRUE)
> be_tab = sleuth_results(so, "treatmentBE", show_all = TRUE)
> reciprocal_sleuth = function(tab) {
+     tab %>% na.omit() %>% group_by(ens_gene) %>% mutate(reciprocal = (sum(b > 
+         0 & qval < 0.05) > 0) & sum(b < 0 & qval < 0.05) > 0 & length(ens_gene) > 
+         1) %>% as.data.frame()
+ }
> bort_tab = reciprocal_sleuth(bort_tab)
> e7107_tab = reciprocal_sleuth(e7107_tab)
> be_tab = reciprocal_sleuth(be_tab)
> write.table(bort_tab, "control-vs-bort-sleuth.csv", quote = FALSE, row.names = FALSE, 
+     col.names = TRUE, sep = ",")
> write.table(e7107_tab, "control-vs-e7107-sleuth.csv", quote = FALSE, row.names = FALSE, 
+     col.names = TRUE, sep = ",")
> write.table(be_tab, "control-vs-be-sleuth.csv", quote = FALSE, row.names = FALSE, 
+     col.names = TRUE, sep = ",")

Transcript MA plots

> library(cowplot)
> sleuth_MA = function(results) {
+     results$DE = results$qval < 0.05
+     ggplot(results, aes(mean_obs, b, color = DE)) + geom_point(size = 0.5) + 
+         guides(color = FALSE) + scale_color_manual(values = c("black", "red")) + 
+         xlab("mean expression") + ylab(expression("<U+0392>")) + scale_x_log10()
+ }

Control vs Bort

> sleuth_MA(bort_tab)

Control vs E7107

> sleuth_MA(e7107_tab)

Control vs BE

> sleuth_MA(be_tab)

Pathway analysis

> orgdb = "org.Hs.eg.db"
> biomart_dataset = "hsapiens_gene_ensembl"
> keggname = "hsa"
> library(dplyr)
> library(clusterProfiler)
> library(orgdb, character.only = TRUE)
> library(biomaRt)
> mart = biomaRt::useMart(biomart = "ensembl", dataset = biomart_dataset)
> entrez = biomaRt::getBM(attributes = c("ensembl_gene_id", "entrezgene"), mart = mart)
> entrez$entrezgene = as.character(entrez$entrezgene)
> summarize_cp = function(res, comparison) {
+     summaries = data.frame()
+     for (ont in names(res)) {
+         ontsum = summary(res[[ont]])
+         ontsum$ont = ont
+         summaries = rbind(summaries, ontsum)
+     }
+     summaries$comparison = comparison
+     summaries = summaries %>% arrange(qvalue)
+     return(summaries)
+ }
> 
> enrich_cp = function(res, comparison) {
+     res = data.frame(res)
+     res = res %>% tibble::rownames_to_column() %>% left_join(entrez, by = c(rowname = "ensembl_gene_id")) %>% 
+         filter(!is.na(entrezgene))
+     universe = res$entrezgene
+     genes = subset(res, padj < 0.05)$entrezgene
+     mf = enrichGO(genes, universe = universe, OrgDb = orgdb, ont = "MF", pAdjustMethod = "BH", 
+         qvalueCutoff = 1, pvalueCutoff = 1)
+     cc = enrichGO(genes, universe = universe, OrgDb = orgdb, ont = "CC", pAdjustMethod = "BH", 
+         qvalueCutoff = 1, pvalueCutoff = 1)
+     bp = enrichGO(genes, universe = universe, OrgDb = orgdb, ont = "BP", pAdjustMethod = "BH", 
+         qvalueCutoff = 1, pvalueCutoff = 1)
+     kg = enrichKEGG(gene = genes, universe = universe, organism = "hsa", pvalueCutoff = 1, 
+         qvalueCutoff = 1, pAdjustMethod = "BH")
+     all = list(mf = mf, cc = cc, bp = bp, kg = kg)
+     all[["summary"]] = summarize_cp(all, comparison)
+     return(all)
+ }
> gsea_cp = function(res, comparison) {
+     res = res %>% data.frame() %>% tribble::rownames_to_column() %>% left_join(entrez, 
+         by = c(rowname = "ensembl_gene_id")) %>% filter(!is.na(entrezgene)) %>% 
+         filter(!is.na(log2FoldChange)) %>% filter(!is.na(lfcSE))
+     lfc = data.frame(res)[, "log2FoldChange"]
+     lfcse = data.frame(res)[, "lfcSE"]
+     genes = lfc/lfcse
+     names(genes) = res$entrezgene
+     genes = genes[order(genes, decreasing = TRUE)]
+     cc = gseGO(genes, ont = "CC", OrgDb = orgdb, nPerm = 500, pvalueCutoff = 1, 
+         pAdjustMethod = "BH", verbose = TRUE)
+     mf = gseGO(genes, ont = "MF", OrgDb = orgdb, nPerm = 500, pvalueCutoff = 1, 
+         pAdjustMethod = "BH", verbose = TRUE)
+     bp = gseGO(genes, ont = "bp", OrgDb = orgdb, nPerm = 500, pvalueCutoff = 1, 
+         pAdjustMethod = "BH", verbose = TRUE)
+     genes = data.frame(res)[, "log2FoldChange"]
+     names(genes) = res$entrezgene
+     genes = genes[order(genes, decreasing = TRUE)]
+     genes = genes[!is.na(genes)]
+     kg = gseKEGG(geneList = genes, organism = "mmu", nPerm = 500, pvalueCutoff = 1, 
+         verbose = TRUE)
+     if (orgdb == "org.Hs.eg.db") {
+         do = summary(gseDO(geneList = genes, nPerm = 500, pvalueCutoff = 1, 
+             pAdjustMethod = "BH", verbose = TRUE))
+         do$ont = "DO"
+         all = list(mf = mf, cc = cc, bp = bp, kg = kg, do = do)
+     } else {
+         all = list(mf = mf, cc = cc, bp = bp, kg = kg)
+     }
+     all[["summary"]] = summarize_cp(all, comparison)
+     return(all)
+ }
> write_enrich = function(res, filename) {
+     res = subset(res, qvalue < 0.05)
+     write.table(res, file = filename, quote = FALSE, sep = ",", row.names = FALSE, 
+         col.names = TRUE)
+ }

Enriched pathways

Here we take the set of genes that are flagged as significantly different for each comparison and look for pathways that tend to be enriched for these genes.

Bort vs control

> bort_enrich = enrich_cp(bort, "Bort")
> write_enrich(bort_enrich$summary, "control-bort-pathway.csv")

E7107 vs control

> e7107_enrich = enrich_cp(e7107, "e7107")
> write_enrich(e7107_enrich$summary, "control-e7107-pathway.csv")

BE vs control

> be_enrich = enrich_cp(be, "BE")
> write_enrich(be_enrich$summary, "control-be-pathway.csv")

TPM matrix

> tpm = so$obs_norm %>% dplyr::select(target_id, sample, tpm) %>% tidyr::spread(sample, 
+     tpm)
> library(biomaRt)
> library(dplyr)
> human = useMart(biomart = "ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl", 
+     host = "jul2015.archive.ensembl.org")
> symbols = getBM(attributes = c("ensembl_transcript_id", "external_gene_name"), 
+     mart = human)
> tpm = tpm %>% left_join(symbols, by = c(target_id = "ensembl_transcript_id"))
> write.table(tpm, file = "tpm.csv", quote = FALSE, col.names = TRUE, row.names = FALSE, 
+     sep = ",")

Pathway tables

Description of these tables:

GeneRatio = (# genes in this set DE) / (total genes DE)
BgRatio = (# genes expressed in this set) / (total genes expressed)
geneID = Entrez IDs of genes that are DE in this set
ont = ontological term tested (MF=molecular function, CC=cellular compartment,
 BP=biological process, KG=KEGG pathway)

Control vs bort

Control vs e7107

Control vs BE

TPM matrix

TPM matrix